load libraries

library(survival)
library(survminer)
library(cgdsr)
library(sparklyr)
library(dplyr)

R Session Processing

Load clinical Data

# Create the connection object used for every cBioPortal query below.
cgds <- cgdsr::CGDS("http://www.cbioportal.org/public-portal/")

# Uncomment to list the cancer studies exposed by the portal:
# Studies <- cgdsr::getCancerStudies(cgds)

# Download the clinical table for the case list "gbm_tcga_pub_all".
clinicalData <- cgds %>%
  cgdsr::getClinicalData("gbm_tcga_pub_all")

# Alternative source: read a local CSV export instead of querying the portal.
# clinicalData <- read.csv("Clinical_tab.csv") # , na.strings = c("", "NA")

Transformations 1

# Recode overall-survival status into a numeric event indicator:
# "LIVING" -> 0 and "DECEASED" -> 1, matched case-insensitively.
# Anything that is not numeric after the substitutions becomes NA.
clinicalData$OS_STATUS <- as.numeric(
  gsub(
    "DECEASED", "1",
    gsub("LIVING", "0", clinicalData$OS_STATUS, ignore.case = TRUE),
    ignore.case = TRUE
  )
)

# Entries of DFS_STATUS that are empty or a single space are labelled
# "DiseaseFree"; every other value is left as it is.
clinicalData$DFS_STATUS <- gsub(
  "^$|^ $", "DiseaseFree", clinicalData$DFS_STATUS,
  ignore.case = TRUE
)

survival plot

# Fit survival curves for overall survival (time = OS_MONTHS,
# event = OS_STATUS), stratified by DFS_STATUS.
fit <- survival::survfit(Surv(OS_MONTHS, OS_STATUS) ~ DFS_STATUS, data = clinicalData)

# Draw the fitted curves with confidence bands, a p-value and a risk table.
survminer::ggsurvplot(
  fit,
  data = clinicalData,
  type = "kaplan-meier",
  # conf.type = "log",
  conf.int = TRUE,
  pval = TRUE,
  fun = "pct",
  risk.table = TRUE,
  size = 1,
  linetype = "strata",
  # Two colours / two labels: assumes DFS_STATUS has exactly two strata.
  palette = c("#E7B800", "#2E9FDF"),
  legend = "top",
  # Fixed: the argument was misspelled `lengend.title`, so the legend title
  # was never set.
  legend.title = "DFS_STATUS",
  # NOTE(review): labels are applied in strata order -- confirm that
  # "DiseaseFree" is the first stratum of `fit`.
  legend.labs = c("DiseaseFree", "Recurred")
)

R Session: Plot DiseaseFree vs Recurred during OS_MONTHS

  # Fetch a fresh copy of the clinical table so this pipeline starts from the
  # raw columns rather than from the recoded data frame produced earlier.
  clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")

  # Timer covers the in-memory dplyr transformations and the plot construction.
  start_time <- Sys.time()
  clinicalData %>%
    # Recode survival status to 0/1 (case-sensitive here) and make it numeric.
    mutate(OS_STATUS = gsub("LIVING", "0", OS_STATUS)) %>%
    mutate(OS_STATUS = gsub("DECEASED", "1", OS_STATUS)) %>%
    # Empty / single-space disease-free status is labelled "DiseaseFree".
    mutate(DFS_STATUS = gsub("^$|^ $", "DiseaseFree", DFS_STATUS)) %>%
    mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
    # Order by follow-up time so the cumulative counts grow along the x axis.
    arrange(OS_MONTHS) %>%
    mutate(DiseaseFree = ifelse(DFS_STATUS == "DiseaseFree", 1, 0)) %>%
    as.data.frame() %>%
    # Running totals of disease-free and not-disease-free rows.
    mutate(n_DiseaseFree = cumsum(DiseaseFree == 1)) %>%
    mutate(n_Recurred = cumsum(DiseaseFree == 0)) %>%
    # `value` / `variable` are not created anywhere in this pipeline; both
    # point layers below supply their own y and colour mappings, which
    # override these plot-level placeholders.
    ggplot(aes(x = OS_MONTHS, y = value, color = variable)) +
    geom_point(aes(y = n_DiseaseFree, col = "n_DiseaseFree")) +
    geom_point(aes(y = n_Recurred, col = "n_Recurred")) +
    # format() keeps the difftime unit (secs/mins/...); pasting the difftime
    # directly printed only the bare number.
    labs(title = paste("Using R Session, Running time = ", format(Sys.time() - start_time)))

Spark Node: Plot DiseaseFree vs Recurred during OS_MONTHS

 # Download the clinical table again for the Spark-based run.
 clinicalData <- cgds %>%
   cgdsr::getClinicalData("gbm_tcga_pub_all")

 # Open (or re-use) a local Spark connection pinned to Spark 2.4.0.
 sc <- spark_connect(
   version = "2.4.0",
   master = "local"
 )
Re-using existing Spark connection to local
 # Copy the local data frame into Spark, replacing any table of the same name.
 clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)

 # Timer starts after the copy, so it covers the Spark-side transformations,
 # the transfer back to R and the plot construction.
 start_time <- Sys.time()
 clinicalData_tbl %>%
   # regexp_replace() is not an R function; it is passed through to Spark SQL.
   mutate(OS_STATUS = regexp_replace(OS_STATUS, "LIVING", "0")) %>%
   mutate(OS_STATUS = regexp_replace(OS_STATUS, "DECEASED", "1")) %>%
   mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "^$|^ $", "DiseaseFree")) %>%
   mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
   # Earlier attempts at handling missing values, kept for reference:
   # mutate(OS_STATUS = regexp_replace(as.numeric(OS_STATUS), 'NaN', NA)) %>%
   # mutate(OS_STATUS = regexp_replace(OS_STATUS, NaN, NA)) %>%
   # na.replace('') %>%  ## not good for OS_STATUS (0,1)
   # dplyr::filter(!is.na(OS_MONTHS))
   # Sort by follow-up time with missing OS_MONTHS placed last.
   arrange(is.na(OS_MONTHS), OS_MONTHS) %>%
   mutate(DiseaseFree = ifelse(DFS_STATUS == "DiseaseFree", 1, 0)) %>%
   # Bring the result back into R; the remaining steps run locally.
   as.data.frame() %>%
   # Running totals of disease-free and not-disease-free rows.
   mutate(n_DiseaseFree = cumsum(as.numeric(DiseaseFree == 1))) %>%
   mutate(n_Recurred = cumsum(as.numeric(DiseaseFree == 0))) %>%
   # `value` / `variable` are not created anywhere in this pipeline; both
   # point layers below supply their own y and colour mappings, which
   # override these plot-level placeholders.
   ggplot(aes(x = OS_MONTHS, y = value, color = variable)) +
   geom_point(aes(y = n_DiseaseFree, col = "n_DiseaseFree")) +
   geom_point(aes(y = n_Recurred, col = "n_Recurred")) +
   # format() keeps the difftime unit (secs/mins/...); pasting the difftime
   # directly printed only the bare number.
   labs(title = paste("Using Spark Node, Running time = ", format(Sys.time() - start_time)))

LS0tCnRpdGxlOiAic3Vydml2YWwgcGxvdCBzZXNzaW9uIFZTIHNwYXJrIG5vZGUiCmRhdGU6ICdgciBTeXMuRGF0ZSgpYCcKb3V0cHV0OiAKICAgIGh0bWxfZG9jdW1lbnQ6CiAgICBudW1iZXJfc2VjdGlvbnM6IHRydWUKICAgIGZpZ19jYXB0aW9uOiB0cnVlCiAgICB0b2M6IHRydWUKICAgIGZpZ193aWR0aDogNwogICAgZmlnX2hlaWdodDogNi41CiAgICB0aGVtZTogY29zbW8KICAgIGhpZ2hsaWdodDogdGFuZ28KICAgIGNvZGVfZm9sZGluZzogaGlkZQotLS0KCiMjIGxvYWQgbGlicmFyaWVzCmBgYHtyfQpsaWJyYXJ5KHN1cnZpdmFsKQpsaWJyYXJ5KHN1cnZtaW5lcikKbGlicmFyeShjZ2RzcikKbGlicmFyeShzcGFya2x5cikKbGlicmFyeShkcGx5cikKYGBgCgojICBSIFNlc3Npb24gUHJvY2Vzc2luZwoKIyMgTG9hZCBjbGluaWNhbCBEYXRhCmBgYHtyfQpjZ2RzIDwtIGNnZHNyOjpDR0RTKCJodHRwOi8vd3d3LmNiaW9wb3J0YWwub3JnL3B1YmxpYy1wb3J0YWwvIikKI1N0dWRpZXM8LSBjZ2Rzcjo6Z2V0Q2FuY2VyU3R1ZGllcyhjZ2RzKQpjbGluaWNhbERhdGEgPC0gY2dkc3I6OmdldENsaW5pY2FsRGF0YShjZ2RzLCAiZ2JtX3RjZ2FfcHViX2FsbCIpCgojY2xpbmljYWxEYXRhIDwtIHJlYWQuY3N2KCJDbGluaWNhbF90YWIuY3N2IikgIywgbmEuc3RyaW5ncz1jKCIiLCJOQSIpCgoKCmBgYAoKCiMjIFRyYW5zZm9ybWF0aW9ucyAxCgpgYGB7cn0KY2xpbmljYWxEYXRhJE9TX1NUQVRVUyA8LSBnc3ViKCJMSVZJTkciLCAiMCIsIGNsaW5pY2FsRGF0YSRPU19TVEFUVVMsIGlnbm9yZS5jYXNlID0gVFJVRSkKY2xpbmljYWxEYXRhJE9TX1NUQVRVUyA8LSBnc3ViKCJERUNFQVNFRCIsICIxIiwgY2xpbmljYWxEYXRhJE9TX1NUQVRVUywgaWdub3JlLmNhc2UgPSBUUlVFKQpjbGluaWNhbERhdGEkREZTX1NUQVRVUyA8LSBnc3ViKCJeJHxeICQiLCAiRGlzZWFzZUZyZWUiLCBjbGluaWNhbERhdGEkREZTX1NUQVRVUywgaWdub3JlLmNhc2UgPSBUUlVFKQpjbGluaWNhbERhdGEkT1NfU1RBVFVTIDwtIGFzLm51bWVyaWMoY2xpbmljYWxEYXRhJE9TX1NUQVRVUykKCmBgYAoKIyMgc3Vydml2YWwgcGxvdAoKYGBge3J9CmZpdCA8LSBzdXJ2aXZhbDo6c3VydmZpdChTdXJ2KE9TX01PTlRIUywgT1NfU1RBVFVTKSB+IERGU19TVEFUVVMsIGRhdGEgPSBjbGluaWNhbERhdGEpCiAgIHN1cnZtaW5lcjo6Z2dzdXJ2cGxvdChmaXQsIGRhdGEgPSBjbGluaWNhbERhdGEsCiAgICAgICAgICAgICAgICAgICAgICAgICAgdHlwZSA9ICJrYXBsYW4tbWVpZXIiLAogICAgICAgICAgICAgICAgICAgICAgICAgICNjb25mLnR5cGU9ImxvZyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgY29uZi5pbnQgPSBUUlVFLAogICAgICAgICAgICAgICAgICAgICAgICAgIHB2YWwgPSBUUlVFLAogICAgICAgICAgICAgICAgICAgICAgICAgIGZ1biA9ICJwY3QiLAogICAgICAgICAgICAgICAgICAgICAgICAgIHJpc2sudGFibGUgPSBUUlVFLAogICAgICAgICAgICAg
ICAgICAgICAgICAgIHNpemUgPSAxLAogICAgICAgICAgICAgICAgICAgICAgICAgIGxpbmV0eXBlID0gInN0cmF0YSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgcGFsZXR0ZSA9IGMoIiNFN0I4MDAiLCAiIzJFOUZERiIpLAogICAgICAgICAgICAgICAgICAgICAgICAgIGxlZ2VuZCA9ICJ0b3AiLAogICAgICAgICAgICAgICAgICAgICAgICAgIGxlbmdlbmQudGl0bGUgPSAiREZTX1NUQVRVUyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgbGVnZW5kLmxhYnMgPSBjKCJEaXNlYXNlRnJlZSIsICJSZWN1cnJlZCIpCiAgICkKYGBgCgoKIyMgUiBTZXNzaW9uOiBQbG90IERpc2Vhc2VGcmVlIHZzIFJlY2N1cmVkIGR1cmluZyBPU19NT05USFMKYGBge3J9CiAgY2xpbmljYWxEYXRhIDwtIGNnZHNyOjpnZXRDbGluaWNhbERhdGEoY2dkcywgImdibV90Y2dhX3B1Yl9hbGwiKQpzdGFydF90aW1lIDwtIFN5cy50aW1lKCkKICBjbGluaWNhbERhdGEgJT4lIAogIG11dGF0ZShPU19TVEFUVVMgPSBnc3ViKCJMSVZJTkciLCAiMCIsIE9TX1NUQVRVUykpICU+JQogIG11dGF0ZShPU19TVEFUVVMgPSBnc3ViKCAiREVDRUFTRUQiLCAiMSIsIE9TX1NUQVRVUykpICU+JQogIG11dGF0ZShERlNfU1RBVFVTID0gZ3N1YiggIl4kfF4gJCIsICJEaXNlYXNlRnJlZSIsIERGU19TVEFUVVMpKSAlPiUKICBtdXRhdGUoT1NfU1RBVFVTID0gYXMubnVtZXJpYyhPU19TVEFUVVMpKSAlPiUKICBhcnJhbmdlKE9TX01PTlRIUykgJT4lCiAgbXV0YXRlKCBEaXNlYXNlRnJlZSA9IGlmZWxzZShERlNfU1RBVFVTID09ICJEaXNlYXNlRnJlZSIsIDEsIDApKSAlPiUgCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIG11dGF0ZShuX0Rpc2Vhc2VGcmVlID0gY3Vtc3VtKERpc2Vhc2VGcmVlID09IDEpKSAlPiUKICBtdXRhdGUobl9SZWN1cnJlZCA9IGN1bXN1bShEaXNlYXNlRnJlZSA9PSAwKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gT1NfTU9OVEhTLCB5ID0gdmFsdWUsIGNvbG9yID0gdmFyaWFibGUpKSArCiAgZ2VvbV9wb2ludChhZXMoeSA9IG5fRGlzZWFzZUZyZWUsIGNvbCA9ICJuX0Rpc2Vhc2VGcmVlIikpICsKICBnZW9tX3BvaW50KGFlcyh5ID0gbl9SZWN1cnJlZCwgY29sID0gIm5fUmVjdXJyZWQiKSkgKwogIGxhYnModGl0bGUgPSBwYXN0ZSgiVXNpbmcgUiBTZXNzaW9uLCBSdW5uaW5nIHRpbWUgPSAiLCBTeXMudGltZSgpIC0gc3RhcnRfdGltZSkpCmBgYAoKIyMgU3BhcmsgTm9kZTogUGxvdCBEaXNlYXNlRnJlZSB2cyBSZWNjdXJlZCBkdXJpbmcgT1NfTU9OVEhTIApgYGB7cn0KIGNsaW5pY2FsRGF0YSA8LSBjZ2Rzcjo6Z2V0Q2xpbmljYWxEYXRhKGNnZHMsICJnYm1fdGNnYV9wdWJfYWxsIikKIHNjIDwtIHNwYXJrX2Nvbm5lY3QobWFzdGVyID0gImxvY2FsIiwKICAgICAgICAgICAgICAgICAgICAgdmVyc2lvbiA9ICIyLjQuMCIpCgogY2xpbmljYWxEYXRhX3RibCA8LSBkcGx5cjo6Y29weV90byhzYywgY2xpbmljYWxEYXRhLCBvdmVyd3JpdGUgPSBUUlVF
KQogIHN0YXJ0X3RpbWUgPC0gU3lzLnRpbWUoKQogIGNsaW5pY2FsRGF0YV90YmwgJT4lCiAgbXV0YXRlKE9TX1NUQVRVUyA9IHJlZ2V4cF9yZXBsYWNlKE9TX1NUQVRVUywgIkxJVklORyIsICIwIikpICU+JQogIG11dGF0ZShPU19TVEFUVVMgPSByZWdleHBfcmVwbGFjZShPU19TVEFUVVMsICJERUNFQVNFRCIsICIxIikpICU+JQogIG11dGF0ZShERlNfU1RBVFVTID0gcmVnZXhwX3JlcGxhY2UoREZTX1NUQVRVUywgIl4kfF4gJCIsICJEaXNlYXNlRnJlZSIpKSAlPiUKICBtdXRhdGUoT1NfU1RBVFVTID0gYXMubnVtZXJpYyhPU19TVEFUVVMpKSAlPiUKICAjbXV0YXRlKE9TX1NUQVRVUyA9IHJlZ2V4cF9yZXBsYWNlKGFzLm51bWVyaWMoT1NfU1RBVFVTKSwgJ05hTicsIE5BKSkgJT4lCiAgI211dGF0ZShPU19TVEFUVVMgPSByZWdleHBfcmVwbGFjZShPU19TVEFUVVMsIE5hTiwgTkEpKSAlPiUKICAjbmEucmVwbGFjZSgnJykgJT4lICAjIyBub3QgZ29vZCBmb3IgT1NfU1RBVFVTICgwLDEpCiAgI2RwbHlyOjpmaWx0ZXIoIWlzLm5hKE9TX01PTlRIUykpIAogIGFycmFuZ2UoaXMubmEoT1NfTU9OVEhTKSwgT1NfTU9OVEhTKSAlPiUgICMjIE9VRkZGIHB1dCBOYW4gYXQgdGhlIGVuZCBvZiB0aGUgY29sdW1uCiAgbXV0YXRlKERpc2Vhc2VGcmVlID0gaWZlbHNlKERGU19TVEFUVVMgPT0gIkRpc2Vhc2VGcmVlIiwgMSwgMCkpICU+JSAKICBhcy5kYXRhLmZyYW1lKCkgJT4lCiAgbXV0YXRlKCBuX0Rpc2Vhc2VGcmVlID0gY3Vtc3VtKGFzLm51bWVyaWMoRGlzZWFzZUZyZWUgPT0gMSApKSkgJT4lCiAgbXV0YXRlKCBuX1JlY3VycmVkID0gY3Vtc3VtKGFzLm51bWVyaWMoRGlzZWFzZUZyZWUgPT0gMCApKSkgJT4lCiAgZ2dwbG90KGFlcyh4ID0gT1NfTU9OVEhTLCB5ID0gdmFsdWUsIGNvbG9yID0gdmFyaWFibGUpKSArCiAgZ2VvbV9wb2ludChhZXMoeSA9IG5fRGlzZWFzZUZyZWUsIGNvbCA9ICJuX0Rpc2Vhc2VGcmVlIikpICsKICBnZW9tX3BvaW50KGFlcyh5ID0gbl9SZWN1cnJlZCwgY29sID0gIm5fUmVjdXJyZWQiKSkgICsKICAgbGFicyh0aXRsZSA9IHBhc3RlKCJVc2luZyBTcGFyayBOb2RlLCBSdW5uaW5nIHRpbWUgPSAiLCBTeXMudGltZSgpIC0gc3RhcnRfdGltZSkpCmBgYAoK